import os
import sys
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
sys.path.append('../')
from functionality import funs
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, LabelEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import recall_score, precision_score, accuracy_score, roc_auc_score, make_scorer, classification_report
# Set the random generators for reproducibility.
# NOTE(review): setting PYTHONHASHSEED at runtime has no effect on the current
# interpreter's hash randomization -- it must be set before Python starts.
os.environ['PYTHONHASHSEED']= str(2124)
# Set a custom color palette:
colors = ['red','darksalmon','olive','darkseagreen','dodgerblue','navy']
color = ['maroon','red','tomato','darksalmon','firebrick',
'darkseagreen','seagreen','lightseagreen','olive','green',
'dodgerblue','deepskyblue','navy','blue','royalblue']
# Register the 15-color list as seaborn's default palette.
my_palette = sns.color_palette(color)
sns.set_palette(my_palette)
# Resolve the current working directory and its parent (the project root).
working = os.getcwd()
dirname = os.path.dirname(working)
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
The data was collected from Turkish students at two faculties: Faculty of Engineering and Faculty of Educational Sciences students in 2019.
The goal is to create an ML model that can predict student performance given the data taken from a survey.
The grades are categorical –AA, BA, BB, CB, CC, DC, DD, and Fail– hence the task should be modelled as a multi-class classification problem.
Data Set Information
The data contains results from a survey with columns 1-10 relate to personal questions, 11-16 are family related, and the remaining questions include education habits.
# Load data. Keep 'Course ID' as a string so it is treated as categorical.
data = pd.read_csv('../data/data.csv', dtype={'Course ID':object})
# Ordered category levels, either taken from the data or stated explicitly,
# used below for plotting and encoding.
ages = data.Age.sort_values().unique()
grades = data.Grade.sort_values().unique()
hours = ['none','<5 hours','6-10 hours','11-20 hours','more than 20 hours']
scholarship = data.Scholarship.sort_values().unique()
notes = ['always', 'sometimes', 'never']
listening = ['always', 'sometimes', 'never']
attendance = ['always', 'sometimes']
exams1 = data['Preparation to Midterm Exams 1'].sort_values().unique()
exams2 = data['Preparation to Midterm Exams 2'].sort_values().unique()
fathers = ['Ph.D.','MSc.','university','high school','primary school','secondary school']
mothers = ['Ph.D.','MSc.','university','high school','primary school','secondary school']
# 'courses' was previously assigned twice with the identical expression;
# the duplicate assignment has been removed.
courses = data['Course ID'].sort_values().unique()
transport = data['Transportation to University'].unique()
# Count the number of students per grade.
props = (data.groupby('Grade')['Grade']
.count().to_frame('Count')
.reset_index())
# Single-cell subplot grid with a 'domain' spec (required for pie traces).
fig = make_subplots(
rows=1,
cols=1,
specs=[[{'type':'domain'}]])
fig.add_trace(
go.Pie(
labels=props['Grade'],
values=props['Count'],
# One color per grade (was len(grades)+1, which passed one extra color).
marker={'colors':color[0:len(grades)]},
sort=False),
1, 1)
fig.update_traces(textposition='inside', textinfo='percent+label')
# Update layout settings for the figure.
fig.update_layout(
title={'text':'Grades % Distribution','font_size':20},
showlegend=False,
height=650,
width=1650,
template='plotly_white')
The outcome data –the grades– shows an imbalanced distribution. Whilst DD has 25% of the data, BA and CB have less than 10% and Fail represents only 5.5% of the whole data –only eight points–. This eventually will present a problem as the model will have few data points to train on predicting the Fail grade, but more data points to train the model on predicting the DD grade.
# Bar chart: student counts per course, colored by grade.
fig = px.bar((data
# Count students per (Grade, Course ID) cell; missing combinations become 0.
.pivot_table(index='Grade', columns='Course ID', values='Student ID', fill_value=0, aggfunc='count')
.unstack()
.to_frame('Count')
.reset_index()),
x='Course ID',
y='Count',
color='Grade',
category_orders={'Grade': grades},
labels={'Count':'# Students'},
# Reverse the palette so the grade order maps onto the colors consistently.
color_discrete_map=dict(zip(grades, np.flip(color)[0:len(grades)])),
)
fig.update_layout(
showlegend=True,
height=500,
width=1650,
template='plotly_white',
title='Grades by Course ID',
yaxis_range = [0,70])
fig.show()
# Exploratory plots for each survey attribute; the category order is supplied
# explicitly so the plots render levels in a meaningful sequence.
funs.eda_plotter(data, 'Age', ages, facet_col='Sex')
funs.eda_plotter(data, 'Weekly Study Hours', hours)
funs.eda_plotter(data, 'Scholarship', scholarship)
funs.eda_plotter(data, ['Attendance to Classes','Listening in Classes','Taking Notes in Classes'], [attendance,listening,notes])
funs.eda_plotter(data, ['Preparation to Midterm Exams 1','Preparation to Midterm Exams 2'], [exams1, exams2])
funs.eda_plotter(data, 'Fathers Education', fathers)
# Use the 'mothers' level list here (previously passed 'fathers'; the two lists
# hold identical values, but using the right name keeps the intent clear and
# makes 'mothers' actually used).
funs.eda_plotter(data, 'Mothers Education', mothers)
# Filter out category levels with only one appearance in the data.
data = data.loc[data['Scholarship'] != 'None']
data = data.loc[data['Transportation to University'] != 'bicycle']
data = data.loc[data['Accommodation Type'] != 'other']
data = data.loc[data['Fathers Education'] != 'Ph.D.']
# Create the X matrix and y outcome.
y = data['Grade']
X = data.drop(['Student ID','Grade'], axis=1)
# Get the classes from the outcome (sorted grade labels).
classes = y.sort_values().unique()
Error, Data Transformation, K-Fold and Metrics
# Create a transformer to one hot encode categorical variables.
Transformer = make_column_transformer(
(OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=object)),
remainder="passthrough")
# Create a stratified shuffled split: 3 repetitions, 10% validation each.
sss = StratifiedShuffleSplit(3, test_size=0.1, random_state=6064)
# Set the list of metrics to assess the models' performances.
# NOTE(review): `needs_proba` is deprecated in recent scikit-learn releases in
# favor of `response_method='predict_proba'` -- verify against the pinned version.
metrics = {'accuracy':make_scorer(accuracy_score, greater_is_better=True),
'precision_macro':make_scorer(precision_score, greater_is_better=True, average='macro', zero_division=0),
'recall_macro':make_scorer(recall_score, greater_is_better=True, average='macro', zero_division=0),
'auc': make_scorer(roc_auc_score, greater_is_better=True, average='macro', needs_proba=True, multi_class='ovr', labels=classes)}
Precision and recall provide insights into the model's performance for each class individually, while accuracy gives an overall view of the model's correctness. Since this is a multi-class classification problem, precision and recall are calculated individually for each class and then averaged.
Precision: measures the proportion of correctly predicted grades out of all grades predicted as a specific grade. In this case, when predicting an AA grade, what proportion of all predicted AA grades were truly AA grades. The procedure is repeated for each individual grade. High precision indicates that the model is good at correctly identifying a specific grade without misclassifying it as one of the other grades. However, it doesn't consider the case when a grade was not predicted as the real grade.
Recall: measures the proportion of correctly predicted grades out of all actual grades in the set. In this case, when predicting an AA grade, what proportion of all AA grades were predicted as AA grades. The procedure is repeated for each individual grade. High recall indicates that the model is good at assigning most of the grades from each category to its real category.
Accuracy: measures the overall correctness of the model's predictions across all grades. It calculates the proportion of correctly predicted grades out of the total number of grades. It provides an overall assessment of the model's performance, considering both correct predictions for identifying the real and false grade category. However, it may not be the most informative metric when dealing with imbalanced datasets, where the number of instances in each class varies significantly.
# Create a label binarizer fitted using y (one-vs-rest indicator targets,
# used later for the ROC/AUC plots).
binarizer = LabelBinarizer().fit(y)
# Create a label encoder fitted using y (integer-coded grade labels).
encoder = LabelEncoder().fit(y)
Train and Test Subsets
Since the data is imbalanced, the imbalance has to be taken into account when splitting into the train and test sets. A stratified split is needed so that the model can train on all possible outcomes – with a distribution comparable to that expected in unseen data.
# Create a train and test set for X and y. Set test size to 20% of the data.
# Stratifying on the grade preserves the imbalanced class distribution in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=1234, stratify=data[['Grade']])
# Per-grade distribution/weights computed from the two splits (also plotted by funs).
weights = funs.grades_distribution([y_train, y_test], ['Train Set', 'Test Set'])
The following list shows the estimators –and their parameters– that are studied to identify the best possible model:
# Candidate estimators with fixed random states for reproducibility.
# NOTE: this set was previously defined twice verbatim; the duplicate block
# (identical constructor calls) has been removed.
log = LogisticRegression(penalty=None, random_state=6064, solver='saga', max_iter=7500, multi_class='multinomial', n_jobs=-1)
l1 = LogisticRegression(penalty='l1', random_state=6064, solver='saga', max_iter=7500, multi_class='multinomial', n_jobs=-1)
l2 = LogisticRegression(penalty='l2', random_state=6064, solver='sag', max_iter=10500, multi_class='multinomial', n_jobs=-1)
net = LogisticRegression(penalty='elasticnet', random_state=6064, solver='saga', max_iter=10500, multi_class='multinomial', n_jobs=-1, l1_ratio=0.5)
sgd = SGDClassifier(loss='modified_huber', penalty=None, max_iter=7500, n_jobs=-1, random_state=6064)
mlp = MLPClassifier(solver='adam', max_iter=4500, random_state=6064)
dtc = DecisionTreeClassifier(random_state=6064)
rfc = RandomForestClassifier(random_state=6064, n_jobs=1)
etc = ExtraTreeClassifier(random_state=6064)
ets = ExtraTreesClassifier(random_state=6064, n_jobs=1)
abc = AdaBoostClassifier(random_state=6064)
gpc = GaussianProcessClassifier(kernel=RBF(0.05), random_state=6064, n_jobs=1)
gbc = GradientBoostingClassifier(loss='log_loss', random_state=6064)
svc = SVC(kernel=RBF(), probability=True)
# Accumulates one validation-score DataFrame per experiment for later comparison.
validation = []
estimators = [log, l1, l2, net, sgd, mlp, dtc, rfc, etc, abc, ets, gpc, gbc, svc]
# Cross-validate every estimator (one-hot transform only, no feature selection).
train, validate = funs.cv_models_performance(estimators, Transformer, X_train, y_train, metrics, sss)
funs.performance_plotter(train, validate, 'Validation', color)
validation.append(validate)
validate.style.hide(axis='index')
| Model | Accuracy | Recall weighted | Precision weighted | AUC |
|---|---|---|---|---|
| logisticregression | 0.250000 | 0.229200 | 0.188900 | 0.625700 |
| logisticregression_l1 | 0.333300 | 0.291700 | 0.223600 | 0.714700 |
| logisticregression_l2 | 0.250000 | 0.194400 | 0.156500 | 0.644300 |
| logisticregression_elasticnet | 0.305600 | 0.263900 | 0.210800 | 0.683800 |
| sgd | 0.277800 | 0.250000 | 0.193500 | 0.551800 |
| mlp | 0.361100 | 0.312500 | 0.241300 | 0.635900 |
| decisiontree | 0.166700 | 0.131900 | 0.103500 | 0.503500 |
| randomforest | 0.416700 | 0.347200 | 0.291700 | 0.695700 |
| extratree | 0.138900 | 0.152800 | 0.121500 | 0.512600 |
| adaboost | 0.250000 | 0.159700 | 0.084600 | 0.607700 |
| extratrees | 0.416700 | 0.375000 | 0.343100 | 0.723000 |
| gaussianprocess | 0.083300 | 0.125000 | 0.010400 | 0.500000 |
| gradientboosting | 0.388900 | 0.395800 | 0.309000 | 0.661400 |
| svc | 0.250000 | 0.125000 | 0.031200 | 0.326000 |
# Repeat cross-validation, dropping one-hot features with variance below 0.10.
train, validate = funs.cv_models_performance(estimators, Transformer, X_train, y_train, metrics, sss, variance_threshold=0.10)
funs.performance_plotter(train, validate, 'Validation', color)
validation.append(validate)
validate.style.hide(axis='index')
| Model | Accuracy | Recall weighted | Precision weighted | AUC |
|---|---|---|---|---|
| logisticregression | 0.277800 | 0.243100 | 0.214600 | 0.632400 |
| logisticregression_l1 | 0.333300 | 0.284700 | 0.229900 | 0.716900 |
| logisticregression_l2 | 0.277800 | 0.222200 | 0.199300 | 0.637900 |
| logisticregression_elasticnet | 0.277800 | 0.201400 | 0.165300 | 0.673600 |
| sgd | 0.250000 | 0.180600 | 0.194400 | 0.572400 |
| mlp | 0.250000 | 0.250000 | 0.156200 | 0.600300 |
| decisiontree | 0.250000 | 0.284700 | 0.211800 | 0.586200 |
| randomforest | 0.250000 | 0.222200 | 0.160100 | 0.660200 |
| extratree | 0.194400 | 0.138900 | 0.116000 | 0.510100 |
| adaboost | 0.194400 | 0.201400 | 0.168500 | 0.605000 |
| extratrees | 0.250000 | 0.208300 | 0.194400 | 0.706400 |
| gaussianprocess | 0.083300 | 0.125000 | 0.010400 | 0.500000 |
| gradientboosting | 0.305600 | 0.291700 | 0.215300 | 0.661900 |
| svc | 0.250000 | 0.125000 | 0.031200 | 0.333800 |
# Repeat cross-validation with a stricter variance threshold of 0.2.
train, validate = funs.cv_models_performance(estimators, Transformer, X_train, y_train, metrics, sss, variance_threshold=0.2)
funs.performance_plotter(train, validate, 'Validation', color)
validation.append(validate)
validate.style.hide(axis='index')
| Model | Accuracy | Recall weighted | Precision weighted | AUC |
|---|---|---|---|---|
| logisticregression | 0.416700 | 0.354200 | 0.354200 | 0.700400 |
| logisticregression_l1 | 0.305600 | 0.250000 | 0.219200 | 0.677200 |
| logisticregression_l2 | 0.277800 | 0.277800 | 0.213200 | 0.675700 |
| logisticregression_elasticnet | 0.250000 | 0.215300 | 0.173600 | 0.703400 |
| sgd | 0.222200 | 0.194400 | 0.135400 | 0.582000 |
| mlp | 0.333300 | 0.298600 | 0.183300 | 0.651500 |
| decisiontree | 0.250000 | 0.180600 | 0.152800 | 0.534000 |
| randomforest | 0.277800 | 0.250000 | 0.163200 | 0.674900 |
| extratree | 0.305600 | 0.284700 | 0.181900 | 0.591800 |
| adaboost | 0.222200 | 0.194400 | 0.140300 | 0.737700 |
| extratrees | 0.305600 | 0.243100 | 0.181900 | 0.741300 |
| gaussianprocess | 0.083300 | 0.125000 | 0.010400 | 0.500000 |
| gradientboosting | 0.250000 | 0.236100 | 0.148600 | 0.649300 |
| svc | 0.250000 | 0.125000 | 0.031200 | 0.320800 |
# Create a pipeline with data transformation and variance threshold.
# make_pipeline names this inner step 'pipeline' when nested below, so the
# grids reference the threshold as 'pipeline__variancethreshold__threshold'.
preprocessor = make_pipeline(Transformer, VarianceThreshold())
# Grid searches for the four logistic-regression variants. Each search is
# skipped when a fitted model is already cached in ../working/best_estimators.
# ('x' not in y reads better than the original `not 'x' in y`.)
if 'logisticregression.joblib' not in os.listdir('../working/best_estimators'):
    log_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, log),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.05, 0.25, 0.025),
            'logisticregression__multi_class': ['multinomial', 'ovr']},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        return_train_score=True)
    _ = log_cv.fit(X_train, y_train)
if 'logisticregression_l1.joblib' not in os.listdir('../working/best_estimators'):
    l1_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, l1),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.05, 0.25, 0.025),
            'logisticregression__C': [0.1, 0.5, 1, 5, 10, 50, 100],
            'logisticregression__multi_class': ['multinomial', 'ovr']},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        return_train_score=True)
    _ = l1_cv.fit(X_train, y_train)
if 'logisticregression_l2.joblib' not in os.listdir('../working/best_estimators'):
    l2_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, l2),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.05, 0.25, 0.025),
            'logisticregression__C': [0.1, 0.5, 1, 5, 10, 50, 100],
            'logisticregression__multi_class': ['multinomial', 'ovr']},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        return_train_score=True)
    _ = l2_cv.fit(X_train, y_train)
if 'logisticregression_elasticnet.joblib' not in os.listdir('../working/best_estimators'):
    net_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, net),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.05, 0.25, 0.025),
            'logisticregression__C': [0.1, 0.5, 1, 5, 10, 50, 100],
            'logisticregression__l1_ratio': np.arange(0.1, 1.1, 0.1),
            'logisticregression__multi_class': ['multinomial', 'ovr']},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        return_train_score=True)
    _ = net_cv.fit(X_train, y_train)
# Grid searches for the SGD and MLP classifiers (same cached-model check).
# NOTE(review): the cache file checked here is 'sgd_l1.joblib' while the
# estimator variable is `sgd` -- confirm the saved filename actually matches.
if 'sgd_l1.joblib' not in os.listdir('../working/best_estimators'):
    sgd_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, sgd),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.05, 0.25, 0.025),
            'sgdclassifier__loss': ['log_loss', 'modified_huber'],
            'sgdclassifier__penalty': ['l2', 'l1', 'elasticnet', None],
            'sgdclassifier__alpha': np.arange(0.0001, 0.11, 0.025)},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        return_train_score=True)
    _ = sgd_cv.fit(X_train, y_train)
if 'mlp.joblib' not in os.listdir('../working/best_estimators'):
    mlp_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, mlp),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.05, 0.25, 0.025),
            'mlpclassifier__hidden_layer_sizes': [(50,), (100,), (150,), (200,)],
            'mlpclassifier__activation': ['identity', 'logistic', 'tanh', 'relu'],
            'mlpclassifier__alpha': [0.1, 0.05, 0.01, 0.001]},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        return_train_score=True)
    _ = mlp_cv.fit(X_train, y_train)
# Grid searches for the single-tree estimators. The three grids are parallel:
# split criterion, depth, split/leaf minima, class weights and pruning alpha.
if 'decisiontree.joblib' not in os.listdir('../working/best_estimators'):
    dtc_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, dtc),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.1, 0.225, 0.025),
            'decisiontreeclassifier__criterion': ['gini', 'entropy', 'log_loss'],
            'decisiontreeclassifier__max_depth': np.arange(5, 11),
            'decisiontreeclassifier__min_samples_split': np.arange(2, 5),
            'decisiontreeclassifier__min_samples_leaf': np.arange(1, 5),
            'decisiontreeclassifier__class_weight': [None, weights],
            'decisiontreeclassifier__ccp_alpha': np.arange(0.005, 0.035, 0.005)},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        return_train_score=True)
    _ = dtc_cv.fit(X_train, y_train)
if 'randomforest.joblib' not in os.listdir('../working/best_estimators'):
    rfc_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, rfc),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.1, 0.225, 0.025),
            'randomforestclassifier__criterion': ['gini', 'entropy', 'log_loss'],
            'randomforestclassifier__max_depth': np.arange(5, 11),
            'randomforestclassifier__min_samples_split': np.arange(2, 5),
            'randomforestclassifier__min_samples_leaf': np.arange(1, 5),
            'randomforestclassifier__class_weight': [None, weights],
            'randomforestclassifier__ccp_alpha': np.arange(0.005, 0.035, 0.005)},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        return_train_score=True)
    _ = rfc_cv.fit(X_train, y_train)
if 'extratree.joblib' not in os.listdir('../working/best_estimators'):
    etc_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, etc),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.1, 0.225, 0.025),
            'extratreeclassifier__criterion': ['gini', 'entropy', 'log_loss'],
            'extratreeclassifier__max_depth': np.arange(5, 11),
            'extratreeclassifier__min_samples_split': np.arange(2, 5),
            'extratreeclassifier__min_samples_leaf': np.arange(1, 5),
            'extratreeclassifier__class_weight': [None, weights],
            'extratreeclassifier__ccp_alpha': np.arange(0.005, 0.035, 0.005)},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        # NOTE(review): this search alone omits train scores (False) while the
        # sibling searches use True -- confirm whether that is intentional.
        return_train_score=False)
    _ = etc_cv.fit(X_train, y_train)
# Grid searches for the ExtraTrees and AdaBoost ensembles.
if 'extratrees.joblib' not in os.listdir('../working/best_estimators'):
    ets_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, ets),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.1, 0.225, 0.025),
            'extratreesclassifier__criterion': ['gini', 'entropy', 'log_loss'],
            'extratreesclassifier__n_estimators': np.arange(5, 11),
            'extratreesclassifier__max_depth': np.arange(5, 11),
            'extratreesclassifier__min_samples_split': np.arange(2, 5),
            'extratreesclassifier__min_samples_leaf': np.arange(1, 5),
            'extratreesclassifier__class_weight': [None, weights],
            'extratreesclassifier__ccp_alpha': np.arange(0.005, 0.035, 0.005)},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        return_train_score=False)
    _ = ets_cv.fit(X_train, y_train)
if 'adaboost.joblib' not in os.listdir('../working/best_estimators'):
    # NOTE(review): this grid references dtc_cv/rfc_cv/etc_cv/ets_cv, which are
    # only defined when those searches actually ran above. If any of the tree
    # models was loaded from cache (its search skipped), this raises NameError.
    abc_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, abc),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.1, 0.225, 0.025),
            # Candidate base learners: the best tree found by each search above.
            'adaboostclassifier__estimator': [dtc_cv.best_estimator_.steps[1][1],
                                              rfc_cv.best_estimator_.steps[1][1],
                                              etc_cv.best_estimator_.steps[1][1],
                                              ets_cv.best_estimator_.steps[1][1]],
            'adaboostclassifier__n_estimators': np.arange(10, 110, 10),
            'adaboostclassifier__learning_rate': np.arange(0.1, 1.1, 0.1)},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        return_train_score=True)
    _ = abc_cv.fit(X_train, y_train)
# Searches for the remaining estimators: Gaussian process (grid), gradient
# boosting (randomized, 30 draws) and SVC (grid).
if 'gaussianprocess.joblib' not in os.listdir('../working/best_estimators'):
    gpc_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, gpc),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.05, 0.25, 0.025),
            'gaussianprocessclassifier__kernel': [RBF(0.001), RBF(0.005), RBF(0.01), RBF(0.05)]},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        return_train_score=False)
    _ = gpc_cv.fit(X_train, y_train)
if 'gradientboosting.joblib' not in os.listdir('../working/best_estimators'):
    gbc_cv = RandomizedSearchCV(
        estimator=make_pipeline(preprocessor, gbc),
        param_distributions={
            'pipeline__variancethreshold__threshold': np.arange(0.05, 0.25, 0.025),
            'gradientboostingclassifier__learning_rate': [0.01, 0.05, 0.1, 0.5, 1, 5],
            'gradientboostingclassifier__n_estimators': [8, 9, 10, 11, 12, 13, 14, 15],
            'gradientboostingclassifier__min_samples_split': np.arange(2, 6),
            'gradientboostingclassifier__min_samples_leaf': np.arange(2, 6),
            'gradientboostingclassifier__ccp_alpha': np.arange(0.005, 0.035, 0.005)},
        n_iter=30,
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        random_state=9597)
    _ = gbc_cv.fit(X_train, y_train)
if 'svc.joblib' not in os.listdir('../working/best_estimators'):
    svc_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, svc),
        param_grid={
            'pipeline__variancethreshold__threshold': np.arange(0.05, 0.25, 0.025),
            'svc__C': np.arange(1, 11, 1),
            'svc__kernel': ['rbf', 'sigmoid']},
        scoring=metrics,
        n_jobs=-1,
        refit='accuracy',
        cv=sss,
        return_train_score=False)
    _ = svc_cv.fit(X_train, y_train)
# Check if the best_estimators folder is empty (truthiness instead of len()==0).
if not os.listdir(os.path.join(dirname, 'working/best_estimators')):
    # Create a list with the GridSearchCV best estimators.
    # NOTE(review): this assumes every *_cv search above actually ran; mixing
    # the per-file cache checks above with this folder-level check can leave
    # some *_cv names undefined -- confirm the two caching schemes agree.
    best_estimators = [
        log_cv.best_estimator_,
        l1_cv.best_estimator_,
        l2_cv.best_estimator_,
        net_cv.best_estimator_,
        sgd_cv.best_estimator_,
        mlp_cv.best_estimator_,
        dtc_cv.best_estimator_,
        rfc_cv.best_estimator_,
        etc_cv.best_estimator_,
        ets_cv.best_estimator_,
        abc_cv.best_estimator_,
        gpc_cv.best_estimator_,
        gbc_cv.best_estimator_,
        svc_cv.best_estimator_]
    # Save the best estimators to folder.
    funs.save_best_estimators(best_estimators)
else:
    # Load the best estimators from folder.
    best_estimators = funs.load_best_estimators()
# Cross-validate the tuned (best) estimators for comparison with the baselines.
train, validate = funs.cv_models_performance(best_estimators, Transformer, X_train, y_train, metrics, sss, best=True)
funs.performance_plotter(train, validate, 'Validation', color)
validation.append(validate)
validate.style.hide(axis='index')
| Model | Accuracy | Recall weighted | Precision weighted | AUC |
|---|---|---|---|---|
| logisticregression | 0.416700 | 0.354200 | 0.354200 | 0.700400 |
| logisticregression_l1 | 0.444400 | 0.375000 | 0.361100 | 0.685400 |
| logisticregression_l2 | 0.388900 | 0.319400 | 0.267400 | 0.699700 |
| logisticregression_elasticnet | 0.444400 | 0.375000 | 0.361100 | 0.691400 |
| sgd_l1 | 0.444400 | 0.340300 | 0.229700 | 0.760700 |
| mlp | 0.416700 | 0.395800 | 0.276400 | 0.678200 |
| decisiontree | 0.444400 | 0.409700 | 0.315300 | 0.671300 |
| randomforest | 0.472200 | 0.395800 | 0.313900 | 0.710800 |
| extratree | 0.444400 | 0.409700 | 0.291700 | 0.723300 |
| adaboost | 0.472200 | 0.444400 | 0.308300 | 0.777200 |
| extratrees | 0.555600 | 0.513900 | 0.465800 | 0.744000 |
| gaussianprocess | 0.083300 | 0.125000 | 0.010400 | 0.500000 |
| gradientboosting | 0.388900 | 0.312500 | 0.152200 | 0.626500 |
| svc | 0.444400 | 0.333300 | 0.236100 | 0.609700 |
# Labels for the four validation experiments accumulated in `validation`.
names = ['Standard Estimator',
'Variance T (0.1)',
'Variance T (0.2)',
'Best Estimator']
# Compare each metric across the four experiments.
funs.comparison_plotter('Accuracy', validation, names, color)
funs.comparison_plotter('Precision weighted', validation, names, color)
funs.comparison_plotter('Recall weighted', validation, names, color)
funs.comparison_plotter('AUC', validation, names, color)
# Fit the tuned estimators on the full train set and score on the held-out test set.
train, test = funs.models_performance_train_test(best_estimators, Transformer, X_train, y_train, X_test, y_test, classes, best=True)
funs.performance_plotter(train, test, 'Test', color)
test.style.hide(axis='index')
| Model | Accuracy | Recall weighted | Precision weighted | AUC |
|---|---|---|---|---|
| logisticregression | 0.310300 | 0.310300 | 0.316300 | 0.721700 |
| logisticregression_l1 | 0.275900 | 0.275900 | 0.422400 | 0.734400 |
| logisticregression_l2 | 0.344800 | 0.344800 | 0.405400 | 0.720700 |
| logisticregression_elasticnet | 0.344800 | 0.344800 | 0.405400 | 0.737800 |
| sgd_l1 | 0.206900 | 0.206900 | 0.115000 | 0.628700 |
| mlp | 0.137900 | 0.137900 | 0.133600 | 0.627800 |
| decisiontree | 0.206900 | 0.206900 | 0.181000 | 0.486100 |
| randomforest | 0.310300 | 0.310300 | 0.250000 | 0.715900 |
| extratree | 0.172400 | 0.172400 | 0.232200 | 0.566100 |
| adaboost | 0.172400 | 0.172400 | 0.137900 | 0.538100 |
| extratrees | 0.172400 | 0.172400 | 0.187700 | 0.538900 |
| gaussianprocess | 0.069000 | 0.069000 | 0.004800 | 0.500000 |
| gradientboosting | 0.275900 | 0.275900 | 0.171300 | 0.605500 |
| svc | 0.206900 | 0.206900 | 0.298300 | 0.558800 |
# Option 1: Load the model from best_estimators using joblib.
L1 = joblib.load('../working/best_estimators/logisticregression_l2.joblib')
Overall Test Performance Report
# Train/test summary metrics for the tuned L2 logistic regression.
funs.classification_report(L1, y_train, X_train, y_test, X_test, classes, roc_plot=False)
TRAIN TEST Accuracy: 0.929 0.345 Recall: 0.929 0.345 Precision: 0.931 0.405 AUC: 0.995 0.721
Test Set Classification Report
# Detailed per-grade precision/recall/F1 on the test set.
l1_test_predictions = L1.predict(X_test)
print(classification_report(y_test, l1_test_predictions, zero_division=0))
precision recall f1-score support
AA 1.00 0.33 0.50 3
BA 1.00 0.67 0.80 3
BB 0.67 0.67 0.67 3
CB 0.00 0.00 0.00 2
CC 0.27 0.75 0.40 4
DC 0.33 0.20 0.25 5
DD 0.14 0.14 0.14 7
Fail 0.00 0.00 0.00 2
accuracy 0.34 29
macro avg 0.43 0.34 0.34 29
weighted avg 0.41 0.34 0.34 29
# Confusion matrices and ROC curves for the tuned logistic model, on both splits.
funs.confusion_matrix_plot(L1, (X_train, X_test), (y_train, y_test))
funs.roc_auc_plot(L1, (X_train, X_test), (y_train, y_test), binarizer)
# Create logodds plot by attribute and grade.
logodds = funs.linear_coefficients(L1, 'logisticregression')
# Create probabilities plot by attribute and grade.
probabilities = funs.linear_coefficients(L1, 'logisticregression', proba=True)
# Prepare the DataFrame for plotting.
# NOTE(review): `df` is not used by the call below -- confirm it is consumed later.
df = probabilities.melt(id_vars='Variable', var_name='Grades', value_name='Coefficient')
funs.probabilities_by_grade(probabilities, probabilities.columns[0:-1])
# Option 2: Load the model from best_estimators using joblib
# (the original comment said "pickle" and repeated "Option 1").
RFC = joblib.load('../working/best_estimators/randomforest.joblib')
# Refit the loaded pipeline on the training data.
_ = (RFC
# .set_params(**extra_params)
.fit(X_train, y_train))
Overall Test Performance Report
# Train/test summary metrics for the tuned random forest.
funs.classification_report(RFC, y_train, X_train, y_test, X_test, classes, roc_plot=False)
TRAIN TEST Accuracy: 1.000 0.310 Recall: 1.000 0.310 Precision: 1.000 0.250 AUC: 1.000 0.716
Test Set Classification Report
# Detailed per-grade precision/recall/F1 for the random forest on the test set.
print(classification_report(y_test, RFC.predict(X_test), zero_division=0))
precision recall f1-score support
AA 1.00 0.67 0.80 3
BA 0.00 0.00 0.00 3
BB 0.00 0.00 0.00 3
CB 0.00 0.00 0.00 2
CC 0.17 0.25 0.20 4
DC 0.25 0.20 0.22 5
DD 0.33 0.71 0.45 7
Fail 0.00 0.00 0.00 2
accuracy 0.31 29
macro avg 0.22 0.23 0.21 29
weighted avg 0.25 0.31 0.26 29
# Confusion matrices and ROC curves for the random forest, on both splits.
funs.confusion_matrix_plot(RFC, (X_train, X_test), (y_train, y_test))
funs.roc_auc_plot(RFC, (X_train, X_test), (y_train, y_test), binarizer)
# Create features importance plot by attribute.
features = funs.tree_importance(RFC, 'randomforestclassifier')
In conclusion, the evaluation of various models reveals their performance on the classification task. The results demonstrate the impact of feature selection and hyperparameter optimization on model performance. The best-performing model, the Logistic Regression with l1 penalization, shows promising results in terms of accuracy, recall, precision, and AUC in comparison to the other classifiers.
Nonetheless, the performance of such a model is still poor – given that the tuning process is made for just one model applied to each grade, leaving the rest out. If a model per grade is developed and fine-tuned, better classification performance can be achieved.